#################################################################################### 
####################################################################################
####################################################################################
# Appendix D: Tables 12--13
####################################################################################
####################################################################################
library(here)
library(kableExtra)
library(RItools)
library(readstata13)
library(dplyr)
####################################################################################
####################################################################################
## Read in survey data (from: analysis_balance.R)
aej = read.csv(here("data/AEJ2018_HHmain_id_attrition.csv"))
age = read.dta13(here("data/AEJ2018_HHroster_ab18.dta"))
a = read.dta13(here("data/AEJ2018_HHmembers_A.dta"))
b = read.dta13(here("data/AEJ2018_HHmembers_B.dta"))
c = read.dta13(here("data/AEJ2018_HHmembers_C.dta"))
d = Reduce(function(x,y) merge(x = x, y = y, by = "hhid", all = T), 
           list(a,b,c))
rm(a,b,c)
# Replace HHID where it is missing
d[is.na(d$villageid),]$villageid = d[is.na(d$villageid),]$villageid.x
d[is.na(d$villageid),]$villageid = d[is.na(d$villageid),]$villageid.y
d[is.na(d$treatment),]$treatment = d[is.na(d$treatment),]$treatment.x
d[is.na(d$treatment),]$treatment = d[is.na(d$treatment),]$treatment.y
d[is.na(d$branchid),]$branchid = d[is.na(d$branchid),]$branchid.x
d[is.na(d$branchid),]$branchid = d[is.na(d$branchid),]$branchid.y
# Calculate household size across columns
d$hhsize = rowSums(d[,c("HHmembers_A", "HHmembers_B", "HHmembers_C")], na.rm=TRUE)

# Rescale age to age at baseline
age = age[!is.na(age$HHroster_ab18_relation) & age$HHroster_ab18_relation == "Household Head",]
age$age_hhh = age$HHroster_ab18_age - 3 
age$edu_hhh = age$HHroster_ab18_education

# Merge age and education data with household size dataset
hh_dat = merge(age, d[, c("hhid", "hhsize", "treatment", "branchid")], by = "hhid", all = T)
rm(age,d)

# Bring in under-1 mortality
imr = read.dta13(here("data/AEJ2018_infant_mortality_computation.dta"))
imr = imr %>% group_by(hhid, treatment, branchid) %>% summarise(death_under1 = sum(death_under1))
hh_dat = merge(hh_dat, imr[, c("hhid", "death_under1", "treatment", "branchid")], by = "hhid", all = T)

# Identify control villages receiving the treatment
hh_dat$attrition = 0
hh_dat[hh_dat$villageid %in% aej[aej$attrition == 1,]$villageid,]$attrition = 1
hh_dat[is.na(hh_dat$treatment.x),]$treatment.x = hh_dat[is.na(hh_dat$treatment.x),]$treatment.y
hh_dat[is.na(hh_dat$treatment.x),]$treatment.x = hh_dat[is.na(hh_dat$treatment.x),]$treatment
hh_dat[is.na(hh_dat$branchid.x),]$branchid.x = hh_dat[is.na(hh_dat$branchid.x),]$branchid.y
hh_dat[is.na(hh_dat$branchid.x),]$branchid.x = hh_dat[is.na(hh_dat$branchid.x),]$branchid

# Merge in remaining HH-level variables
table(aej$hhid %in% hh_dat$hhid)
aej = aej[,! colnames(aej) %in% c("treatment", "branchid")]

hh_dat = merge(hh_dat, aej, by = "hhid", all.x = T)

# Slice up data into the normal sample (without attrition) and the control sample (with attrition)
hhc = hh_dat[hh_dat$treatment.x == 0,]
hh = hh_dat[hh_dat$attrition.x == 0,]


####################################################################################
####################################################################################
## Table D.12

hh = hh %>% group_by(villageid.x) %>% summarise_all(funs(mean(., na.rm = TRUE)))
hhc = hhc %>% group_by(villageid.x) %>% summarise_all(funs(mean(., na.rm = TRUE)))

# Sample Balance
model = formula("treatment.x ~ hhsize + age_hhh + edu_hhh + death_under1 + HH_TV + HH_radio + HH_anyone_phone + HH_clothes +
                HH_years_invillage + HH_meals + HH_electricity")
balance_test = xBalance(model,
                        strata = factor(hh$branchid.x), #list(noblocks=NULL)
                        #cluster = factor(dat$village_id), 
                        data = hh, na.rm = T,
                        report=c("adj.means","adj.mean.diffs","z.scores","chisquare.test","p.values"))
rownames(balance_test$results) = c("Household Size","Head of Household Age","Head of Household Education", "Mortality Under 1", "Years in Village",
                                   "TV Ownership", "Radio Ownership", "Phone Ownership", "Clothing Ownership", "Meals with Meat", "Electricity")
rownames(balance_test$overall) = c("Overall Test Statistic")

kable(round(balance_test$results,2),align = "c", booktabs = T, caption = "Household-Level Block-Adjusted Omnibus Balance Test (Original Study)", 
      col.names = c("Control Mean", "Treatment Mean", "Difference","Z-score","P-value"), format = "latex")
kable(round(balance_test$overall,2), align = "c", row.names = T,
      col.names = c("Chi-Sq", "Df", "P-value"), format = "latex")

####################################################################################
####################################################################################
## Table D.13

# Control group balance
model = formula("attrition.x ~ hhsize + age_hhh + edu_hhh + death_under1 + HH_TV + HH_radio + HH_anyone_phone + HH_clothes +
                HH_years_invillage + HH_meals + HH_electricity")
balance_test = xBalance(model,
                        strata = factor(hhc$branchid.x), #list(noblocks=NULL)
                        #cluster = factor(dat$village_id), 
                        data = hhc, na.rm = T,
                        report=c("adj.means","adj.mean.diffs","z.scores","chisquare.test","p.values"))
rownames(balance_test$results) = c("Household Size","Head of Household Age","Head of Household Education", "Mortality Under 1", "Years in Village",
                                   "TV Ownership", "Radio Ownership", "Phone Ownership", "Clothing Ownership", "Meals with Meat", "Electricity")
rownames(balance_test$overall) = c("Overall Test Statistic")

kable(round(balance_test$results,2),align = "c", booktabs = T, caption = "Household-Level Block-Adjusted Omnibus Balance Test (Original Study Control)", 
      col.names = c("Control Mean", "Treatment Mean", "Difference","Z-score","P-value"), format = "latex")
kable(round(balance_test$overall,2), align = "c", row.names = T,
      col.names = c("Chi-Sq", "Df", "P-value"), format = "latex")

rm(aej, balance_test, dat, hh, hh_dat, hhc, model, covariates)